####################################### DATA ####################################### sessionInfo() # list of packages currently attached library() # see packages currently installed library(help="datasets") # Information on package 'datasets' data(package="datasets") # get info on data in the 'dataset' package AirPassengers # data set AirPassenger from the 'dataset' # package is immediately available # ATTENTION: # discrepancies between R version and package # version might require to load the data first # use "data(NAME.DATA)" for version >= 2.0.0 data(package=.packages(all.available = TRUE)) # get info on data from all the data sets # in packages already installed data() # list of data sets (mostly data frames) in all # packages that are attached to the session demo(graphics) # see some of the possibilities that R offers q() # quit current session and save workspace ############### Importing/Exporting Data in R ####################################### my.hills<-read.table("hills.txt",header=TRUE) # read a text file into data frame head(my.hills, n=3) # same as 'hills' from MASS Year<-c(1800,1850,1900,1950,2000) Carbon<-c(8,54,534,1630,6611) write.table( # write a data frame into a csv file data.frame(year=Year,carbon=Carbon), # most spreadsheet can export/import csv files file="fuel.csv",row.names=F,sep=",") fossifuel<-read.csv("fuel.csv") # reverse action fossifuel # return a data frame storing the data fossifuel<-read.table("fuel.csv", # the same header=TRUE,sep=",") # read a text file with columns separated fossifuel # by commas into a data frame write(Carbon,file="Carbon.txt") # write a vector into a text file scan("Carbon.txt") # read a text file into a vector matrix(1:10,ncol=2) write(matrix(1:10,ncol=2),file="x.mat.txt") # write a matrix into a text file, # ATTENTION:columns become rows scan("x.mat.txt") # ATTENTION: it creates a vector matrix(scan("x.mat.txt"),ncol=2) # to get back a matrix use 'matrix()' x<-c(1) data.entry(x) # spreadsheet interface for editing data a.df<-data.frame() # dummy data frame fix(a.df) # make permanent changes to a.df a.df edit(a.df) # 'edit' does not change the object 'a.df' # instead, a copy is made and that copy # is changed ############### Demo: Simple Data Analysis ############################################# library(MASS) # attach the package MASS # all data set of MASS are now accessible hills # display the contents of the data frame "hills" average<-sapply(hills,mean) # arithmetic means of the three columns of data average # print the content of the object 'average' range(hills$time) # range of the values in the vector 'time' plot(time ~ dist, data=hills, pch=16) # plot the data out<-lm(time ~ dist, data=hills) # fit the simple linear model: # time=b_0+b_1*dist+error summary(out) # show results of regression plot(out) # display of regression fitting and diagnostics b<-out$coeff # estimates of coefficients plot(time ~ dist, data=hills, pch=16) abline(b[1],b[2],col=2) # actual data vs fitted regression line # first argument = intercept, # second argument = slope points(average[1],average[3], pch=17,col=2,cex=1.5) # plot average values of 'dist' and 'time' abline(v=average[1],lty=2) ## v = vertical line at mean(dist), lty tratteggita abline(h=average[3],lty=2) ## h = horizontal line at mean(time) ############### Vectors ####################################### coeffs<-c(1,-3,2) # vector containing the coefficients of the # equation x^2-3x+2=0 class(coeffs) # class of the object "coeffs" length(coeffs) # dimension of the vector names(coeffs) # names of the elements of the vector names(coeffs)<-c("a","b","c") # modified names class(c("a","b","c")) # class is character coeffs # display all elements of 'coeffs' coeffs[2] # display 2nd element of 'coeffs' coeffs["b"] # the same discr<-coeffs[2]^2-4*coeffs[1]*coeffs[3] # discriminant of the quadratic equation b^2-4ac discr>0 # b^2-4ac>0 gives a logical value x<-seq(0,3,length=200) ## regular spaced points as values of x y<-coeffs[1]*x^2+coeffs[2]*x+coeffs[3] # images of x under the quadratic function # y=f(x)=a*x^2+b*x+c plot(x,y,type="l",ylab=expression(x^2-3*x+2)) # plot as a line abline(h=0,lty=2) # intersections with the first axis # are solutions of the equation seq(0,3,length=200) # regular spaced points = sequenza da 0 a 3 di 200 punti -2:2 # sequence of integers x<-vector("numeric",length=100) # create an empty numeric vector of 100 c(x,x) # combine two vectors ax<- 1; bx<- -3; cx<- 2 ax-bx+2*cx # [1] 8 bx/bx*cx/ax+bx # [1] -1 (bx/bx)*(cx/ax)+bx # better bx^cx # -3 raised to 2 log(cx); log(cx,exp(1)); log(x=cx,base=exp(1))# all the same, log(2) x<-cx/(-bx) # [1] 0.6666... round(x,2); trunc(x); ceiling(x); floor(x) ## x<-seq(1,3,length=21) # [1] 1 1.1 1.2 ... 2.8 2.9 3 min(x); max(x); mean(x) sum(x); prod(x) cumsum(x) cumprod(x) sum(x)/length(x) # arithmetic mean, equal to mean(x) prod(1:5) ## factorial factorial(1:5) ## factorial sum((x-mean(x))^2)/(length(x)-1) # sample variance var(x) # sample variance ############### Logic ####################################### x<- 1:5 # [1] 1 2 3 4 5 x < 5 # is x less than 5? x > 1 # is x bigger than 5? x > 1 & x < 5 # is x bigger than 1 and less than 5? x > 1 && x < 5 # first one is false x > 1 | x < 5 # is x bigger than 1 or less than 5? x > 1 || x < 5 # first one is true x == 3 # is x equal to 3? x != 3 # is x not equal to 3? !x == 3 # not (x equal to 3) !(x == 3) # better A<- x > 1 B<- x < 5 !( (!A) | (!B) ) # De Morgan A & B A + 1 # logical vector in ordinary arithmetic sum(A) # count numbers of TRUE in A any(A) # is there any TRUE in A sum(A) > 1 # the same all(A) # are all elements of A TRUE sum(A) == length(A) # the same prod(A) == 1 # the same which(B) # which elements of B are TRUE ############### Character vectors and factors ####################################### A<- c("beer","beer","wine","water") B<- c("beer","wine") # two vectors of class character A table(A) # finds the unique values and tabulates the frequencies of their occurrences union(A,B) # no duplicates value intersect(A,B) setdiff(A,B) drinks<-factor(A) # coerce a character vector to a factor drinks # it is used to store categorical data # with a specific set of values called levels table(drinks) # operates the same unclass(drinks) # stored as integer vector levels(drinks) # returns the unique values (useful) names(table(A)) # the same alcoholic<-factor(B) union(drinks,alcoholic) # operates the same intersect(drinks,alcoholic) beer<-c(3,4,1,1,3,4,3,3,1,2,1,2) # daily consumption of 12 peaple wine<-c(2,0,1,4,2,0,1,2,1,2,0,2) # of beer and wine glasses table(beer,wine) # table() by default converts numeric vectors to factors PIPPO<-round( table(beer,wine)/length(beer),2 ) # divide by 12, round to the second digit sum(PIPPO) ############### Data frames ####################################### #write.table(Cars93.summary,file="Cars93.summary.txt",row.names=T,sep=" ") Cars93.summary<-read.table("Cars93.summary.txt",header=TRUE) Cars93.summary # all data set of DAAG are in the work space head(Cars93.summary, n=3) # display the first three rows Cars93.summary[1:3, ] # the same rownames(Cars93.summary) # row names names(Cars93.summary) # variable (column) names summary(Cars93.summary) # simple summary statistics: min, max, quartiles # and mean for numeric vectors; levels and # frequencies for factors. data.frame( X1=1:10, X2=I(letters[1:10]), # create a data frame X3=factor(letters[1:10]) ) data.frame( 1:10, I(letters[1:10]), # R attemps to pick row and column names from factor(letters[1:10]) ) # the constituent vectors my.data<-data.frame( 1:10, I(letters[1:10]), factor(letters[1:10]) ) names(my.data)<-c("X1","X2","X3") # assign the names to variables my.data Min.passengers # error, object not found Cars93.summary$Min.passenger # it is now accessed as a vector attach(Cars93.summary) # attach data frame Min.passengers # now the object is accessed by its name class(abbrev) # 'abbrev' is a vector of class factor detach(Cars93.summary) # detach data frame library(MASS) # attach MASS package data(package="MASS") # display dataset in package = cosa c'e' dentro head(Cars93,n=3) # display the first three rows of 'Cars93' Cars93[1:3,c(3,18)] # "Type" and "Passengers" are the 3rd and 18th # variables of the data frame Cars93 class(Cars93$Type) # factor attach(Cars93) # attach data frame Cars93$Type Type aggregate(Passengers,by=list(Type),min) # reproduce first variable of 'Cars93.summary' aggregate(by=list(Type),Passengers,min) # reproduce first variable of 'Cars93.summary' aggregate(by=list(Type),Passengers,sum) # reproduce first variable of 'Cars93.summary' aggregate(Passengers,by=list(Type),max) # reproduce first variable of 'Cars93.summary' Cars93.summary # check detach(Cars93) # detach data frame my.Cars<-stack(Cars93.summary, select= 1:2) # ACCODAMENTO - concatenates selected columns into a single my.Cars # column named "values", and add a factor # named "ind" that has names of the concatenated # columns as levels unstack(my.Cars) # reverse the stacking operation library(MASS) head(hills,n=3) # back to "hills" data from MASS package hills$time<-round(hills$time*60) # express time in minutes (no decimal) hills$time # display new version str(hills) lapply(hills,max) # maximum of each columns as a list sapply(hills,max) # the same as vector (see 'help(lapply)') science<-read.table("science.txt",header=TRUE) science[!complete.cases(science),] # Which rows have missing values in the data # frame science (DAAG)? dim(science) # 1385 observations (rows) Science<- na.omit(science) # Omit any rows that contain missing values dim(Science) # 1383 observations ############### Lists ####################################### x.lis<-list(a=1:10, b=letters[1:3], # create a list d=factor(rep(letters[1:4],2))) # 'rep(x,n)' repeat the values in 'x' n times x.lis x.lis$a # object 1 is numeric x.lis[[2]] # object 2 is of class 'character' table(x.lis$d) # form a table for object 3, a factor e<-list(e=10:1) c(x.lis,e) # combine two lists ############### Matrices ####################################### x.mat<-matrix(1:12,nrow=3,ncol=4) # create a matrix x.mat # display 'x.mat' dim(x.mat) # 'x.mat' is a 3x4-matrix dimnames(x.mat)<-list(paste("row",1:3,sep=""),# assign names to rows and columns paste("col",1:4,sep="")) # 'paste(x,y)' concatenates elementwise # 'x' and 'y' after converting to character x.mat matrix( paste( # a 4x2-matrix of character arguments paste("entry",rep(1:4, 2),sep=" "), # look at the combined use of 'paste' and 'rep' rep(1:2, each=4),sep=","), # with 'rep(x,each=n)' each element of 'x' nrow=4,ncol=2) # is repeated n times. xx<-cbind(x.mat,x.mat) # combine x with itself by columns xx # xxx<-rbind(x.mat,x.mat) # combine x with itself by columns xxx # rbind(xx,xxx) # Error: the dimensions are not compatible Year<-c(1800,1850,1900,1950,2000) # create vector of time points (in years) Carbon<-c(8,54,534,1630,6611) # production level for each year plot(Year,Carbon,pch=16,cex=1.5) # plot Carbon against Year # ('pch' is the point character, 'cex' is the # size of the point) fossifuel<-data.frame(year=Year,carbon=Carbon)# create a dataframe fossifuel # fossifuel.mat<-matrix(c(Year,Carbon),nrow=5) # create a matrix, colnames(fossifuel.mat)<-c("year","carbon") # assign names to columns fossifuel.mat # compare to data frame (no rows names) cbind(Year,Carbon) # the same as 'fossifuel.mat', but # column names are automatically assigned ############### Indexing ####################################### x<- 1:10 # [1] 1 2 ... 10 names(x)<- letters[x] # assign names as "a", "b", ... , "j" x[1:3] # elements 1,2,3 x[c(1,10)] # elements 1 and 10 x[c(-1,-2)] # remove 1 and 2 x[x>5] # elements > 5 x[c("a","d")] # elements 1 and 4 x.mat<- matrix(1:100, ncol=10) # create a (10,10)-matrix x.mat[1:5,] # first 5 rows, all columns x.mat[1:4,x[x<3]] # check this ############### Computation ####################################### x.mat<- matrix(1:10, ncol=2) # create a (5,2)-matrix x.mat x.mat + 1 # sum by a scalar (recycling rule) x.mat + x.mat # sum of two matrices x.mat %*% t(x.mat) # matrix product, t() is the transpose function x.mat %*% x.mat # Error: non comformable for multiplication X<- matrix(sample(1:10,24,replace=T),ncol=6) # sample 24 times from 1,...,10 with replacement # then fill the values in a (4,6)-matrix dim(X) y<-c(-1,0.5,0.3,0.2) # 4-dimensional vector t(X) %*% y # matrix product X'y, linear combination of the # rows of X crossprod(X,y) # the same, more efficient crossprod(X) # matrix product X'X crossprod(t(X)) # matrix product XX', same as 'tcrossprod(X)' A<-crossprod(t(X)) # symmetric (4,4)-matrix v<-eigen(A);v # list with eigenvalues and eigenvectors of A # 'v$values' and 'v$vectors' det(A); prod(v$values) # det(A)=prod_i(eigenvalues(A)_i) crossprod(v$vectors[,2]) # eigenvectors are normalized crossprod(v$vectors[,1],v$vectors[,2]) # the matrix formed by concatenating # eigenvectors is orthogonal # (almost: R usenumerical routined for # approximating the eigenvectors) solve(A) # inverse of A A%*%solve(A) # AA^(-1)= identity matrix round(A%*%solve(A),2) # it is an approximation by numerical routines P<-v$vectors;P # matrix with normalized eigenvectors as columns apply(P,1,sum) # sum row-wise apply(P,2,sum) # sum column-wise apply(P,2,crossprod) # simultaneous check eigenvectors are normalized ############### More on Linear Algebra: decomposition of A ####################################### Lambda<-diag(v$values) # diagonal matrix with eigenvalues inv.Lambda<-diag(v$values^(-1)) # inverse of a diagonal matrix inv.A<-P%*%inv.Lambda%*%t(P) # inverse(A)= P*Lambda^{-1}P' inv.A;solve(A) # check # ATTENTION: A must be positive definite! square.root.A<-P%*%sqrt(Lambda)%*%t(P) # A^{1/2}=P*Lambda^{1/2}*P' # 'sqrt' operates element-wise square.root.A%*%square.root.A;A # check B<-P%*%sqrt(Lambda) # decomposition A=BB' crossprod(t(B)); A # check C<-chol(A) # choleski decomposition A=C'C C # with C upper triangular crossprod(C); A # check ############### Graphics in R ####################################### primates<-read.table("primates.txt",header=TRUE) attach(primates) plot(Brainwt~Bodywt, data=primates) # Place labels on points and axes plot(Brainwt~Bodywt, xlim=c(0, 300), # 'xlim' sets the range of x-axis xlab="Body weight (kg)", # 'xlab' and 'ylab' set the axes labels ylab="Brain weight (g)", pch=16) # 'pch' controls the plotting symbol text(Brainwt ~ Bodywt, # pick labels from row names labels=row.names(primates), pos=4) # 'pos=4', where 1: below, 2: to the left, # 3: above, 4: to the right mtext("Primates data", 3, line=1, cex=1.5) # add text to the margin, leaving 1-line space # 'side=3', where 1: $x$-axis, 2: $y$-axis, # 3: top, 4: right-vertical axis detach(primates) ############### The use of color ####################################### theta <- (1:50)*0.92 # 1st coordinate is in angle (radiants) # DIFFERENT IN THE SCRIPT plot(theta, sin(theta), col=1:50, # plot *sin(theta)' (elementwise) against pch=16, cex=4) # 'theta' with points of different colors # 'pch=16' set the symbol as a circle points(theta, cos(theta), col=51:100, # add points (theta,cos(theta)) of more pch=15, cex=4) # different colors # 'pch=15' set the symbol as a square theta <- (1:400)*(pi/40) # 1st coordinate is in angle (radiants) plot(theta, sin(theta), type="l", col=2, # plot 'sin(theta)' as a red line lty=1, lwd=2) # 'lty=1' set the line type (solid) # 'lwd=2' set the line width (default is 1) lines(theta, cos(theta), col=3, # add a green line for cos(theta) lty=2, lwd=2) # 'lty=2' set the line type as dashed ############### Portability ####################################### Year<-c(1800,1850,1900,1950,2000) Carbon<-c(8,54,534,1630,6611) plot(Year,Carbon,type="l") mtext("Fossifuel data", 3, line=1, cex=1.5) # now Copy on the Clipboard from the File menu, # or save the file in a specific format, # again from the File menu jpeg("file.jpg") # Jpeg file saved in the directory plot(Year,Carbon,type="l") mtext("Fossifuel data", 3, line=1, cex=1.5) dev.off() pdf("file.pdf") # pdf file saved in the directory plot(Year,Carbon,type="l") mtext("Fossifuel data", 3, line=1, cex=1.5) dev.off() postscript("file.ps") # Postscript file plot(Year,Carbon,type="l") # as default a ps file is in landscape mode mtext("Fossifuel data", 3, line=1, cex=1.5) dev.off() postscript("file.ps", horizontal=F) # plot in portrait mode plot(Year,Carbon,type="l") mtext("Fossifuel data", 3, line=1, cex=1.5) dev.off() postscript("file.ps", horizontal=F, height=8,width=8) # plot in portrait mode and resized plot(Year,Carbon,type="l") mtext("Fossifuel data", 3, line=1, cex=1.5) dev.off() postscript("file.ps", horizontal=F, paper="special",height=8,width=8) # paper size specified by height and width plot(Year,Carbon,type="l") mtext("Fossifuel data", 3, line=1, cex=1.5) dev.off()